// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![no_std]

//! Simple driver for the SHA256 hardware in the LPC55S, originally developed
//! for RFD374 stage0's DICE CDI updates, but generalized here for other
//! purposes. Its stage0 heritage manifests in two main ways: first, it's very
//! limited; second, it's got some possibly-surprising `inline` directives to
//! minimize generated code size.
//!
//! The SHA256 implementation has passed a bank of tests generated by
//! `sha256sum` on Linux. The HMAC implementation correctly generates the
//! results for the subset of test vectors in RFC 4231 that are consistent with
//! the system's limitations (below). (As well as some manually generated test
//! cases.) The stage0 codebase contains a test suite for its version of this
//! code; because the code isn't currently shared, it's always possible they've
//! diverged.
//!
//! # Assumptions and limitations
//!
//! This is a _deliberately_ very limited implementation, to save space and
//! eliminate corner cases we'd otherwise have to think about.
//!
//! First, the SHA256 implementation:
//!
//! - Input is in units of 32-bit words only. Because of how SHA padding works,
//!   this means you can't compute the correct SHA256 for (say) a 3-byte string.
//!   However, the types used in these APIs will also prevent you from
//!   _attempting_ to compute that SHA256, so this seems ok.
//!
//! - We're doing SHA256 _only,_ so block and hash sizes are fixed by
//!   compile-time constants, and input and output are in fixed-length arrays
//!   defined by those constants.
//!
//! HMAC:
//!
//! - We only implement HMAC-SHA256 without truncation. (Truncation would be
//!   easy to add outside this module if you want it.)
//!
//! - Keys are always exactly one SHA256 digest in length, which is to say, 256
//!   bits. This eliminates some corner cases and conditionals from the key
//!   processing code, not to mention a bunch of bounds checks.
//!
//! - Keys are delivered in blocks of 32-bit words to ensure alignment
//!   (specifically, as fixed-length arrays of `u32`).
//!
//! - Data is also delivered in streams of 32-bit words, but can be any length.
//!
//! - For implementation convenience, data must be a single contiguous block of
//!   32-bit words that can be addressed by one slice, which in theory limits
//!   the maximum count to `isize::MAX`. In practice address space limitations
//!   on our processor limit it to less than that (about 1 GiB at a time).
//!   (This restriction would be straightforward to lift if required.)

use core::num::Wrapping;
use userlib::{sys_irq_control, sys_recv_notification};

// The values below are dictated by the SHA256 algorithm itself; they are not
// tuning knobs and must not be changed.

/// Number of 32-bit words in one 512-bit SHA256 input block (16).
const WORDS_PER_BLOCK: usize = 512 / u32::BITS as usize;
/// Number of 32-bit words in one 256-bit SHA256 digest (8).
const WORDS_PER_HASH: usize = 256 / u32::BITS as usize;

/// `WORDS_PER_BLOCK` again, pre-wrapped as a `Wrapping<u64>` so it can be used
/// directly in arithmetic on the running word counter.
const WORDS_PER_BLOCK64: Wrapping<u64> = Wrapping(WORDS_PER_BLOCK as u64);

/// State we maintain for an ongoing hash operation.
///
/// A `Hasher` is created by [`Hasher::begin`], fed data through
/// [`Hasher::update`], and consumed by [`Hasher::finish`], which returns the
/// digest.
pub struct Hasher<'a> {
    /// Register block of the HASHCRYPT peripheral that does the actual
    /// hashing. All hardware access goes through this reference.
    engine: &'a lpc55_pac::hashcrypt::RegisterBlock,
    /// The number of words that have been fed to `update` so far. (During
    /// execution of `finish` this also counts padding words.)
    ///
    /// We use this for two purposes:
    /// 1. Keeping track of where we are in the current block.
    /// 2. Writing the length of data in bits to the final block as required by
    ///    SHA256.
    ///
    /// In both of these cases, wrapping is fine -- SHA256 actually specifies
    /// the count as wrapping at 64 bits. So using a `Wrapping<u64>` saves some
    /// overflow checks.
    ///
    /// This is `u64` instead of `usize` because you can call `update`
    /// repeatedly with new slices, meaning the number stored here can easily
    /// exceed `usize::MAX` on a 32-bit platform.
    word_count: Wrapping<u64>,
    /// Notification mask corresponding to the HASHCRYPT interrupt in whatever
    /// code is hosting this driver. Used to sleep when things aren't ready.
    ///
    /// A value of 0 disables sleeping: the driver then polls the status
    /// register in a busy loop instead of waiting for a notification.
    notification_mask: u32,
}

impl<'a> Hasher<'a> {
    /// Starts a new SHA256 hash operation, initializing the `HASHCRYPT` unit.
    ///
    /// You are _strongly advised_ to have reset the HASHCRYPT unit just before
    /// this, lest it contain nonsense left over from a previous hash. (In
    /// particular, the boot ROM leaves nonsense in it, so it needs to be reset
    /// at least once after startup.)
    ///
    /// This driver doesn't reset the peripheral itself so that we don't make
    /// assumptions about how reset is reached (e.g. by an IPC).
    ///
    /// `notification_mask` is the bitmask corresponding to the HASHCRYPT IRQ
    /// routing to your task. If you'd rather busywait, pass 0.
    #[inline(never)]
    pub fn begin(
        engine: &'a lpc55_pac::hashcrypt::RegisterBlock,
        notification_mask: u32,
    ) -> Self {
        // Put the multi-function thingy into SHA2 256 mode and start a new
        // hash. (The UM is not entirely clear whether setting the mode and
        // starting the hash in a single write is legal, but it works and NXP's
        // code appears to do the same thing.)
        engine
            .ctrl
            .write(|w| w.mode().sha2_256().new_hash().start());

        Self {
            engine,
            word_count: Wrapping(0),
            notification_mask,
        }
    }

    /// Extends the current hash-in-progress with the given `data`,
    /// exclusive-ORing each word with `mask.
    ///
    /// `data` may cross block boundaries, be a partial block, etc. It will be
    /// concatenated with the `data` passed to any other `update` call.
    ///
    /// In most cases you want a `mask` of 0; the parameter is provided because
    /// it's useful in certain HMAC operations and using the same routine for
    /// both cases saves some space.
    #[inline(never)]
    pub fn update(&mut self, data: &[u32], mask: u32) {
        for &word in data {
            self.load_word(word, mask);
        }
    }

    /// Completes the SHA256 hash.
    ///
    /// You should seriously consider flipping the HASHCRYPT unit back into
    /// reset after calling this, to avoid leaking data about
    /// whatever-it-was-you-were-just-doing. Or don't; we're not the cops.
    #[inline(never)]
    pub fn finish(mut self) -> [u32; WORDS_PER_HASH] {
        // The SHA-256 hardware works in units of 16 words / 64 bytes / 512
        // bits, called blocks. After the actual `data` goes into the hardware,
        // we have to finish it off with something called Merkle-Damgård (MD)
        // padding. The variety of MD padding specified in the SHA-256 spec is:
        //
        // - Add a 1 bit.
        // - Add enough 0 bits for the current block to reach 448 (mod 512)
        //   bits.  (That is, until there are only 64 bits left in the block.)
        //   If the block is already past 448 bits by the time we added the 1,
        //   this means finishing out this block, and then starting a new one
        //   with 448 zeros.
        // - Add the length of the original data, _in bits,_ as a 64-bit
        //   big-endian integer to round out the final block to 512 bits.
        // - Add the final block(s) to the digest.
        //
        // (If you're curious, this construction provides a defense against
        // messages of slightly different lengths hashing to the same value.)
        //
        // Since we move data in 32-bit words only, the padding process is
        // slightly simplified here:
        //
        // - Append a word with only its MSB set.
        // - Append words of zeros until two words of space remain in the block.
        //   This may require starting a new block.
        // - Append the high word of the data length in bits, and the low word,
        //   in that order, as big-endian integers.

        let word_count_before_padding = self.word_count;

        // We want the PAD bit to be in the MSB of the first byte added to the
        // digest, which, due to us being little-endian, means our pad value is:
        const PAD: u32 = 0x80_00_00_00_u32.swap_bytes();

        self.load_word(PAD, 0);
        // Extend with zeros until we're aligned properly for the final length.
        while self.word_count % WORDS_PER_BLOCK64
            != WORDS_PER_BLOCK64 - Wrapping(2)
        {
            self.load_word(0, 0);
        }
        // We are now 14 words into a block, no synchronization is necessary, we
        // just need to load the length of the pre-padded data in bits. As with
        // PAD above, since these aren't round-tripping through little-endian
        // memory, we wind up having to swap their bytes:
        let Wrapping(length) = word_count_before_padding * Wrapping(32);
        self.load_word(u32::swap_bytes((length >> 32) as u32), 0);
        self.load_word(u32::swap_bytes(length as u32), 0);

        // Wait for our result!
        while self.engine.status.read().digest().is_not_ready() {
            if self.notification_mask != 0 {
                // Permit the hardware to generate an IRQ on DIGEST
                self.engine.intenset.write(|w| w.digest().set_bit());

                // Wait for it!
                sys_irq_control(self.notification_mask, true);
                sys_recv_notification(self.notification_mask);

                // Turn it back off lest it spam us in the future.
                self.engine.intenclr.write(|w| w.digest().set_bit());
            }
        }

        // The result arrives in registers called digest0..digest7, which the
        // PAC calls digest0[0] .. digest0[7] for some reason.
        let mut result = [0; WORDS_PER_HASH];
        for (dest, reg) in result.iter_mut().zip(&self.engine.digest0) {
            *dest = reg.read().bits().swap_bytes();
        }

        result
    }

    /// Utility factor for making sure we synchronize with the device at the
    /// start of each 16-word input block.
    #[inline(never)]
    fn load_word(&mut self, word: u32, mask: u32) {
        if self.word_count % WORDS_PER_BLOCK64 == Wrapping(0) {
            // Wait for the controller to be interested in what we have to say.
            while self.engine.status.read().waiting().is_not_waiting() {
                if self.notification_mask != 0 {
                    // Permit the hardware to generate an IRQ on WAITING
                    self.engine.intenset.write(|w| w.waiting().set_bit());

                    // Wait for it!
                    sys_irq_control(self.notification_mask, true);
                    sys_recv_notification(self.notification_mask);

                    // Turn it back off lest it spam us in the future.
                    self.engine.intenclr.write(|w| w.waiting().set_bit());
                }
            }
        }
        self.engine
            .indata
            .write(|w| unsafe { w.data().bits(word ^ mask) });
        self.word_count += Wrapping(1);
    }
}
